#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import warnings
warnings.filterwarnings('ignore')
# ---------------------------------------------------------------
# Load the data and take a first look.
# ---------------------------------------------------------------
# NOTE(review): hard-coded absolute Windows path -- this only runs on the
# author's machine; consider a relative path or a CLI argument.
df=pd.read_csv(r"C:\Users\sunil\Downloads\data.csv")
# Bare expressions like the two below only display output in a notebook;
# they are no-ops when executed as a plain script.
df
df.columns
# Data overview: first five rows.
df.head()
# Drop the 'Unnamed: 32' and 'id' columns -- neither carries predictive
# information ('Unnamed: 32' is presumably an empty trailing column from
# the CSV -- TODO confirm).
df=df.drop(columns=['Unnamed: 32','id'])
# Number of rows and columns.
df.shape
# 569 rows x 31 columns, i.e. 30 features plus one target class.
# Check the column dtypes.
df.dtypes
# All columns are numeric except the target label 'diagnosis'.
df.describe()
# Check whether any null value remains anywhere in the frame.
df.isnull().values.any()
# Count of each class label ('M' / 'B' per the comments further down).
df['diagnosis'].value_counts()
# ---------------------------------------------------------------
# Data visualization
# ---------------------------------------------------------------
# Histogram of every feature to eyeball distributions and skew.
df.hist(bins=50, figsize=(15, 15))
plt.show()

# Per-column count of missing values (cross-check of the isnull test above).
df.isna().sum()

# Scatter matrix to inspect pairwise correlation between attributes.
# NOTE(review): with 30 features this draws a 30x30 grid and is slow.
sns.pairplot(df)
plt.show()

# Count of each class label.
ax = sns.countplot(y='diagnosis', data=df, palette='Set1')

# Get a count of the number of 'M' & 'B' cells.
df['diagnosis'].value_counts()

# Visualize this count. FIX: pass the column by keyword -- the bare
# positional-Series form is deprecated and removed in recent seaborn.
sns.countplot(x='diagnosis', data=df, label="Count")

# Correlation heatmap. FIX: drop the non-numeric target first --
# DataFrame.corr() raises on string columns in pandas >= 2.0
# (older pandas silently excluded them).
plt.figure(figsize=(20, 20))
sns.heatmap(df.drop(columns=['diagnosis']).corr(), annot=True, fmt='.0%', cmap='Blues')
# Box plot per feature to check for outliers in each column.
# FIX: the exported source had lost the function-body indentation, which
# is invalid as plain Python; reconstructed with standard formatting.
def boxPlot(dff):
    """Draw one small box plot per feature column of *dff*.

    The non-numeric 'diagnosis' target column is dropped first; each
    remaining column gets its own figure.
    """
    d = dff.drop(columns=['diagnosis'])
    for column in d:
        plt.figure(figsize=(5, 2))
        sns.boxplot(x=column, data=d, palette="colorblind")


boxPlot(df)
# ---------------------------------------------------------------
# Remove outliers with Tukey's IQR fences.
# FIX: restrict to the numeric feature columns -- quantile() and the
# elementwise comparisons raise on the string 'diagnosis' column in
# pandas >= 2.0 (older pandas dropped/ignored it silently). The row
# filter is unchanged: 'diagnosis' never flagged a row before either.
# ---------------------------------------------------------------
num = df.drop(columns=['diagnosis'])
Q1 = num.quantile(0.25)
Q3 = num.quantile(0.75)
IQR = Q3 - Q1
# Quartiles and IQR per feature.
print("Quartile 1:\n", Q1)
print("\nQuartile 3:\n", Q3)
print("\nIQR :\n", IQR)
# Boolean mask of cells outside the fences Q1-1.5*IQR / Q3+1.5*IQR.
print((num < (Q1 - 1.5 * IQR)) | (num > (Q3 + 1.5 * IQR)))
# Keep only rows where NO feature is an outlier.
df_out = df[~((num < (Q1 - (1.5 * IQR))) | (num > (Q3 + (1.5 * IQR)))).any(axis=1)]
df.shape, df_out.shape
# Outlier-free frame looks good -- proceed to modeling.
# Separate features (X) and labels (y) from the outlier-filtered frame.
X=df_out.drop(columns=['diagnosis'])
y=df_out['diagnosis']
y
# Visualize the box plots again on the filtered data.
boxPlot(df_out)
# Encode the categorical target as integers. LabelEncoder sorts the
# classes, so presumably 'B' -> 0, 'M' -> 1 -- confirm before reading
# TN/TP off the confusion matrices below.
from sklearn.preprocessing import LabelEncoder
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
y
# Split the dataset into training and test sets (75/25, fixed seed for
# reproducibility).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# Feature scaling: standardize to zero mean / unit variance. The scaler
# is fit on the training set only, then applied to the test set to avoid
# leaking test statistics into training.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Train a Logistic Regression model on the scaled training set.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train, y_train)

# Report mean accuracy on both splits.
train_score = classifier.score(X_train, y_train)
test_score = classifier.score(X_test, y_test)
print(f"Logistic Regression training set classification score: {train_score:.4f} ")
print(f"Logistic Regression testing set classification score: {test_score:.4f} ")

# Predict the held-out test set and build the confusion matrix.
y_pred = classifier.predict(X_test)
y_pred
cm = confusion_matrix(y_test, y_pred)
# Unpack the 2x2 matrix: rows are truth, columns are prediction.
TN, FP = cm[0, 0], cm[0, 1]
FN, TP = cm[1, 0], cm[1, 1]
cm
# Fit a K-Nearest Neighbours classifier (k=5; minkowski with p=2 is
# Euclidean distance).
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
classifier.fit(X_train, y_train)
# Accuracy on both splits (message typo "KNN Regression" also fixed).
print(f"KNN training set classification score: {format(classifier.score(X_train, y_train), '.4f')} ")
print(f"KNN testing set classification score: {format(classifier.score(X_test, y_test), '.4f')} ")
# Predict the test set.
y_pred = classifier.predict(X_test)
# BUG FIX: compute the confusion matrix BEFORE unpacking it -- the
# original read TN/TP/FN/FP from the previous model's stale `cm`.
cm = confusion_matrix(y_test, y_pred)
TN = cm[0][0]
TP = cm[1][1]
FN = cm[1][0]
FP = cm[0][1]
cm
# Fit a linear-kernel Support Vector Machine.
from sklearn.svm import SVC
classifier = SVC(kernel = 'linear', random_state = 0)
classifier.fit(X_train, y_train)
# Accuracy on both splits.
print(f"SVM training set classification score: {format(classifier.score(X_train, y_train), '.4f')} ")
print(f"SVM testing set classification score: {format(classifier.score(X_test, y_test), '.4f')} ")
# Predict the test set.
y_pred = classifier.predict(X_test)
# BUG FIX: compute the confusion matrix BEFORE unpacking it -- the
# original read TN/TP/FN/FP from the previous model's stale `cm`.
cm = confusion_matrix(y_test, y_pred)
TN = cm[0][0]
TP = cm[1][1]
FN = cm[1][0]
FP = cm[0][1]
cm
# Fit a kernel SVM (RBF kernel).
from sklearn.svm import SVC
classifier = SVC(kernel = 'rbf', random_state = 0)
classifier.fit(X_train, y_train)
# Accuracy on both splits.
print(f"K-SVM training set classification score: {format(classifier.score(X_train, y_train), '.4f')} ")
print(f"K-SVM testing set classification score: {format(classifier.score(X_test, y_test), '.4f')} ")
# Predict the test set.
y_pred = classifier.predict(X_test)
# BUG FIX: compute the confusion matrix BEFORE unpacking it -- the
# original read TN/TP/FN/FP from the previous model's stale `cm`.
cm = confusion_matrix(y_test, y_pred)
TN = cm[0][0]
TP = cm[1][1]
FN = cm[1][0]
FP = cm[0][1]
cm
# Fit a Gaussian Naive Bayes classifier.
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)
# Accuracy on both splits.
print(f"Naive Bayes training set classification score: {format(classifier.score(X_train, y_train), '.4f')} ")
print(f"Naive Bayes testing set classification score: {format(classifier.score(X_test, y_test), '.4f')} ")
# Predict the test set.
y_pred = classifier.predict(X_test)
y_pred
# BUG FIX: compute the confusion matrix BEFORE unpacking it -- the
# original read TN/TP/FN/FP from the previous model's stale `cm`.
cm = confusion_matrix(y_test, y_pred)
TN = cm[0][0]
TP = cm[1][1]
FN = cm[1][0]
FP = cm[0][1]
cm
# Fit a Decision Tree classifier (entropy split criterion).
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)
# Accuracy on both splits.
print(f"Decision Tree training set classification score: {format(classifier.score(X_train, y_train), '.4f')} ")
print(f"Decision Tree testing set classification score: {format(classifier.score(X_test, y_test), '.4f')} ")
# Predict the test set.
y_pred = classifier.predict(X_test)
y_pred
# BUG FIX: compute the confusion matrix BEFORE unpacking it -- the
# original read TN/TP/FN/FP from the previous model's stale `cm`.
cm = confusion_matrix(y_test, y_pred)
TN = cm[0][0]
TP = cm[1][1]
FN = cm[1][0]
FP = cm[0][1]
cm
# Fit a Random Forest classifier (10 trees, entropy split criterion).
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)
# Accuracy on both splits.
print(f"Random Forest training set classification score: {format(classifier.score(X_train, y_train), '.4f')} ")
print(f"Random Forest testing set classification score: {format(classifier.score(X_test, y_test), '.4f')} ")
# Predict the test set.
y_pred = classifier.predict(X_test)
# BUG FIX: compute the confusion matrix BEFORE unpacking it -- the
# original read TN/TP/FN/FP from the previous model's stale `cm`.
cm = confusion_matrix(y_test, y_pred)
TN = cm[0][0]
TP = cm[1][1]
FN = cm[1][0]
FP = cm[0][1]
cm
# Compare the models' test accuracies in a single bar chart.
# NOTE(review): these numbers are transcribed by hand from the runs
# above -- re-check them if the data, split, or seeds change.
label = ['Logistic Regression', 'K-NN', 'SVM', 'K-SVM',
         'Naive Bayes', 'Decision Tree', 'Random Forest']
accuracy = [0.97, 0.96, 0.95, 0.96, 0.95, 0.92, 0.94]

# One bar per model, each with its own color.
index = np.arange(len(label))
plt.bar(index, accuracy, color=['k', 'r', 'g', 'b', 'c', 'y', 'm'])
plt.xlabel('Algorithms', fontsize=10)
plt.ylabel('Accuracy', fontsize=10)
plt.xticks(index, label, fontsize=10, rotation=90)
plt.title('Best suited classification model')
plt.show()